library(fitdistrplus)
library(foreign)

get.pctile.inc.bounds.by.dist <- function(the.data, pctiles=c(20,40,60,80,95), n.obs=100) {
  #data$inc.point <- with(data, inc.ll + (inc.ul - inc.ll)/2)
  fake.incs <- c()
  pc.remaining <- 100
  for (row in seq(1, nrow(the.data))) {
    #fake.obs <- rep(data$inc.point[row], 10 * data$inc.pc[row])
    pop.pc <- the.data$pop_pc[row]
    pc.remaining <- pc.remaining - pop.pc
    fake.obs <- runif(n.obs * pop.pc, min=the.data$inc_ll[row], max=the.data$inc_ul[row])
    fake.incs <- append(fake.incs, fake.obs)
  }
  fake.incs.left <- fake.incs
  fake.incs.left[which(fake.incs.left == 0)] <- NA
  fake.incs.left <- append(fake.incs.left, rep(max(the.data$inc_ul), times=n.obs*pc.remaining))
  fake.incs.right <- fake.incs
  fake.incs.right <- append(fake.incs.right, rep(NA, times=n.obs*pc.remaining))
  fake.incs.cens <- data.frame(left=fake.incs.left, right=fake.incs.right)
#  return(fake.incs.cens)
#}
  #dist.fit <- fitdist(log(fake.incs), "norm")
  dist.fit <- fitdistcens(fake.incs.cens, "lnorm")
  dist.mean <- dist.fit$estimate[1]
  dist.sd <- dist.fit$estimate[2]
  return(qlnorm(pctiles/100,
                meanlog=dist.mean,
                sdlog=dist.sd))
}


get.pctile.inc.bounds.by.dist2 <- function(the.data, pctiles=c(20,40,60,80,95), n.obs=100) {
  fake.incs.left <- c()
  fake.incs.right <- c()
  pc.remaining <- 100
  for (row in seq(1, nrow(the.data))) {
    pop.pc          <- the.data$pop_pc[row]
    pc.remaining    <- pc.remaining - pop.pc
    fake.obs.left   <- rep(the.data$inc_ll[row], times=n.obs*pop.pc)
    fake.obs.right  <- rep(the.data$inc_ul[row], times=n.obs*pop.pc)
    fake.incs.left  <- append(fake.incs.left, fake.obs.left)
    fake.incs.right <- append(fake.incs.right, fake.obs.right)
  }
  # There the.data has no row for the final right-censored part of the distribution
  # so we generate an appropriate proportion/number of right-censored observations.
  fake.incs.left  <- append(fake.incs.left, rep(max(the.data$inc_ul), times=n.obs*pc.remaining))
  fake.incs.right <- append(fake.incs.right, rep(NA, times=n.obs*pc.remaining))
  # Correctly left-censor the observations.
  min.inc.cens <- min(the.data$inc_ll)
  fake.incs.left[which(fake.incs.left == min.inc.cens)] <- NA
  fake.incs.right[which(fake.incs.right == min.inc.cens)] <- 1
  fake.incs.cens <- data.frame(left=fake.incs.left, right=fake.incs.right)
  dist.fit <- fitdistcens(fake.incs.cens, "lnorm")
  dist.mean <- dist.fit$estimate[1]
  dist.sd <- dist.fit$estimate[2]
  return(qlnorm(pctiles/100,
                meanlog=dist.mean,
                sdlog=dist.sd))
}


get.pctile.inc.bounds <- function(the.data, pctiles=c(20,40,60,80,95)) {
  cum.pctiles <- c()
  cum.pc <- 0
  for (row in seq(1, nrow(the.data))) {
    cum.pc <- cum.pc + the.data$pop_pc[row]
    cum.pctiles <- append(cum.pctiles, cum.pc)
  }
  inc.bounds <- c()
  for (pctile in pctiles) {
    idxs <- which(cum.pctiles <= pctile)
    if (length(idxs) == 0) {
      #idx.lower <- 1
      pc.remaining <- pctile
      band.proportion <- pc.remaining / the.data$pop_pc[1]
      bound <- the.data$inc_ll[1] + band.proportion*(the.data$inc_ul[1] - the.data$inc_ll[1])
    }
    else {
      idx.lower <- max(idxs)
      pc.remaining <- pctile - cum.pctiles[idx.lower]
      band.proportion <- pc.remaining / the.data$pop_pc[idx.lower+1]
      if (is.na(band.proportion)) {
        ## the.data$pop_pc[idx.lower+1] is not present because the final income
        ## band is not large enough to capture our top percentile(s) request.
        ## The hacky solution is to simply use the bottom threshold of the top income
        ## band and hope that it is not too far out.
        #bound <- the.data$inc_ul[idx.lower]
        
        ## Calculate the slope of the distribution between the current two upper and lower bounds.
        ## Extrapolate the slope forward as far into the remaining band as necessary
        ## to get to the requested pctile, on the (approximating) assumption that
        ## the income distribution continues along the same slope.
        slope <- (the.data$inc_ll[idx.lower] - the.data$inc_ul[idx.lower]) / the.data$pop_pc[idx.lower]
        extra.inc <- pctile - the.data$inc_ul[idx.lower]
        bound <- the.data$inc_ul[idx.lower] + extra.inc / slope
      }
      else if (band.proportion == 0) {
        bound <- the.data$inc_ul[idx.lower]
      }
      else {
        bound <- the.data$inc_ul[idx.lower] + band.proportion*(the.data$inc_ul[idx.lower+1] - the.data$inc_ul[idx.lower])
      }
    }
    inc.bounds <- append(inc.bounds, bound)
  }
  return(inc.bounds)
}


# Some test data to play with
canada.1963 <- data.frame(inc.ll=c(0, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 8000, 10000),#, 15000),
                          inc.ul=c(1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 8000, 10000, 15000),#, 25000),
                          pop.pc=c(8, 5.3, 4.6, 5.7, 5, 6.1, 6.3, 8.2, 5.8, 8.6, 5.4, 5.6, 4.5, 7.2, 4.4, 7)#, 2.3)
                          )

bartels.1947 <- data.frame(inc.ll=c(0, 13951.872, 22513.248, 30528.528, 43317.744),
                           inc.ul=c(13951.872, 22513.248, 30528.528, 43317.744, 71098.176),
                           pop.pc=c(20, 20, 20, 20, 15)#, 5)
                           )


